import pandas as pd
pd.set_option('display.float_format', lambda x: '%.2f' % x)
import plotly.express as px
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import HTML
import plotly.graph_objects as go
from geopy.geocoders import Nominatim
from matplotlib import gridspec
def jupyter_settings():
%matplotlib inline
%pylab inline
plt.style.use( 'bmh' )
plt.rcParams['figure.figsize'] = [25, 12]
plt.rcParams['font.size'] = 24
display( HTML( '<style>.container { width:100% !important; }</style>') )
sns.set()
jupyter_settings()
%pylab is deprecated, use %matplotlib inline and import the required libraries. Populating the interactive namespace from numpy and matplotlib
data = pd.read_csv('../data/kc_house_data.csv')
data.head()
| id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | ... | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7129300520 | 20141013T000000 | 221900.00 | 3 | 1.00 | 1180 | 5650 | 1.00 | 0 | 0 | ... | 7 | 1180 | 0 | 1955 | 0 | 98178 | 47.51 | -122.26 | 1340 | 5650 |
| 1 | 6414100192 | 20141209T000000 | 538000.00 | 3 | 2.25 | 2570 | 7242 | 2.00 | 0 | 0 | ... | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 47.72 | -122.32 | 1690 | 7639 |
| 2 | 5631500400 | 20150225T000000 | 180000.00 | 2 | 1.00 | 770 | 10000 | 1.00 | 0 | 0 | ... | 6 | 770 | 0 | 1933 | 0 | 98028 | 47.74 | -122.23 | 2720 | 8062 |
| 3 | 2487200875 | 20141209T000000 | 604000.00 | 4 | 3.00 | 1960 | 5000 | 1.00 | 0 | 0 | ... | 7 | 1050 | 910 | 1965 | 0 | 98136 | 47.52 | -122.39 | 1360 | 5000 |
| 4 | 1954400510 | 20150218T000000 | 510000.00 | 3 | 2.00 | 1680 | 8080 | 1.00 | 0 | 0 | ... | 8 | 1680 | 0 | 1987 | 0 | 98074 | 47.62 | -122.05 | 1800 | 7503 |
5 rows × 21 columns
data.shape
(21613, 21)
data.dtypes
id int64 date object price float64 bedrooms int64 bathrooms float64 sqft_living int64 sqft_lot int64 floors float64 waterfront int64 view int64 condition int64 grade int64 sqft_above int64 sqft_basement int64 yr_built int64 yr_renovated int64 zipcode int64 lat float64 long float64 sqft_living15 int64 sqft_lot15 int64 dtype: object
data['date'] = pd.to_datetime(data['date'])
data['floors'] = data['floors'].astype(int)
data.isna().sum()
id 0 date 0 price 0 bedrooms 0 bathrooms 0 sqft_living 0 sqft_lot 0 floors 0 waterfront 0 view 0 condition 0 grade 0 sqft_above 0 sqft_basement 0 yr_built 0 yr_renovated 0 zipcode 0 lat 0 long 0 sqft_living15 0 sqft_lot15 0 dtype: int64
cat_attributes = data[['view', 'condition', 'waterfront', 'grade']]
num_attributes = data.drop(['view', 'condition', 'waterfront', 'grade'], axis = 1 )
num_attributes.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| id | 21613.00 | 4580301520.86 | 2876565571.31 | 1000102.00 | 2123049194.00 | 3904930410.00 | 7308900445.00 | 9900000190.00 |
| price | 21613.00 | 540088.14 | 367127.20 | 75000.00 | 321950.00 | 450000.00 | 645000.00 | 7700000.00 |
| bedrooms | 21613.00 | 3.37 | 0.93 | 0.00 | 3.00 | 3.00 | 4.00 | 33.00 |
| bathrooms | 21613.00 | 2.11 | 0.77 | 0.00 | 1.75 | 2.25 | 2.50 | 8.00 |
| sqft_living | 21613.00 | 2079.90 | 918.44 | 290.00 | 1427.00 | 1910.00 | 2550.00 | 13540.00 |
| sqft_lot | 21613.00 | 15106.97 | 41420.51 | 520.00 | 5040.00 | 7618.00 | 10688.00 | 1651359.00 |
| floors | 21613.00 | 1.45 | 0.55 | 1.00 | 1.00 | 1.00 | 2.00 | 3.00 |
| sqft_above | 21613.00 | 1788.39 | 828.09 | 290.00 | 1190.00 | 1560.00 | 2210.00 | 9410.00 |
| sqft_basement | 21613.00 | 291.51 | 442.58 | 0.00 | 0.00 | 0.00 | 560.00 | 4820.00 |
| yr_built | 21613.00 | 1971.01 | 29.37 | 1900.00 | 1951.00 | 1975.00 | 1997.00 | 2015.00 |
| yr_renovated | 21613.00 | 84.40 | 401.68 | 0.00 | 0.00 | 0.00 | 0.00 | 2015.00 |
| zipcode | 21613.00 | 98077.94 | 53.51 | 98001.00 | 98033.00 | 98065.00 | 98118.00 | 98199.00 |
| lat | 21613.00 | 47.56 | 0.14 | 47.16 | 47.47 | 47.57 | 47.68 | 47.78 |
| long | 21613.00 | -122.21 | 0.14 | -122.52 | -122.33 | -122.23 | -122.12 | -121.31 |
| sqft_living15 | 21613.00 | 1986.55 | 685.39 | 399.00 | 1490.00 | 1840.00 | 2360.00 | 6210.00 |
| sqft_lot15 | 21613.00 | 12768.46 | 27304.18 | 651.00 | 5100.00 | 7620.00 | 10083.00 | 871200.00 |
data2 = data.copy()
data2['standard'] = data2['price'].apply(lambda x: 'high_standard' if x >= 540000 else 'standard')
data2['house_age'] = data2['date'].apply(lambda x: "new_house" if x >= pd.to_datetime('2014/01/01', format = "%Y/%m/%d") else "old_house")
data2['dormitory_type'] = data2['bedrooms'].apply(lambda x: "studio" if x == 1 else "apartment" if x == 2 else "house" if x > 2 else "NA")
data2['condition_type'] = data2['condition'].apply(lambda x: "bad" if x <= 2 else "regular" if ( x == 3 ) | ( x == 4) else "good" )
data2['year'] = data2['date'].dt.year
data2['week'] = data2['year'].astype(str) + " " + (data2['date'].dt.isocalendar().week).astype(str)
data2['level'] = data2['price'].apply(lambda x: 0 if x < 321950 else
1 if (x >= 321950) & (x < 450000) else
2 if (x >= 450000) & (x < 645000) else
3
)
data3 = data2.copy()
data3 = data3[data3['bedrooms'] < 33]
data4 = data3.copy()
report = data4[['id', 'date','bedrooms','sqft_lot','price','standard']].sort_values('price', ascending = False).reset_index()
report.head()
| index | id | date | bedrooms | sqft_lot | price | standard | |
|---|---|---|---|---|---|---|---|
| 0 | 7252 | 6762700020 | 2014-10-13 | 6 | 27600 | 7700000.00 | high_standard |
| 1 | 3914 | 9808700762 | 2014-06-11 | 5 | 37325 | 7062500.00 | high_standard |
| 2 | 9254 | 9208900037 | 2014-09-19 | 6 | 31374 | 6885000.00 | high_standard |
| 3 | 4411 | 2470100110 | 2014-08-04 | 5 | 35069 | 5570000.00 | high_standard |
| 4 | 1448 | 8907500070 | 2015-04-13 | 5 | 23985 | 5350000.00 | high_standard |
report.to_csv('../data/report_1.csv', index = False)
plot_ano = data4.groupby('date')['price'].mean().reset_index()
plt.rcParams["figure.figsize"] = (25,8)
plt.title("Price change over time")
sns.lineplot(x = data4['date'], y = data4['price'])
<AxesSubplot: title={'center': 'Price change over time'}, xlabel='date', ylabel='price'>
data1_week = data4.sort_values('week')
plt.title("Price change over weeks")
plt.xticks(fontsize = 7, rotation = 90)
sns.barplot(x = data1_week['week'], y = data1_week['price'])
<AxesSubplot: title={'center': 'Price change over weeks'}, xlabel='week', ylabel='price'>
plt.figure(figsize = (8,5) )
sns.histplot(data = data4, x = 'bedrooms')
<AxesSubplot: xlabel='bedrooms', ylabel='Count'>
preco_ano = data4[['yr_built','price']].groupby('yr_built').mean().reset_index()
fig = go.Figure()
fig.add_trace(go.Scatter(x=preco_ano['yr_built'], y=preco_ano['price'],
line_shape='spline'))
fig.update_layout(title_text='AVG Price by YR Built',title_x=0.5,title_y=0.9, width = 800, height = 600)
media_dorm = data4[['dormitory_type','price']].groupby('dormitory_type').mean().reset_index()
fig = px.bar(media_dorm, y = 'price', x = 'dormitory_type', text_auto = '.2s', width = 800, height = 500)
fig.update_layout(title_text='Preço por tipo de dormitório',title_x=0.5,title_y=0.95,width = 800, height = 600)
fig.update_traces(textfont_size=12)
fig.show()
media_renovacao = data4[['yr_renovated','price']].groupby('yr_renovated').mean().reset_index()
media_renovacao = media_renovacao[media_renovacao['yr_renovated']>=1930]
fig = go.Figure()
fig.add_trace(go.Scatter(x=media_renovacao['yr_renovated'], y=media_renovacao['price'],
line_shape='spline'))
fig.update_layout(title_text='Price by YR Renovated',title_x=0.5,title_y=0.9, width = 800, height = 600)
media_ano_dorms = data4[['yr_built','dormitory_type','price']]
media_ano_dorms = pd.pivot_table(media_ano_dorms, values='price', index=['yr_built'], columns=['dormitory_type'], aggfunc=np.sum).fillna(0).reset_index().groupby('yr_built').sum()
media_ano_dorms[['NA','apartment','house', 'studio']] = media_ano_dorms[['NA','apartment','house', 'studio']].mask(media_ano_dorms[['NA','apartment','house', 'studio']] > 300000000, 30000000)
fig = px.imshow(media_ano_dorms.values,
labels=dict(x="dormitory_type", y="yr_built", color="Price"),
x = media_ano_dorms.columns,
y = media_ano_dorms.index,
color_continuous_scale='brwnyl'
)
fig.update_xaxes(side="top")
fig.update_layout(width = 800, height = 800)
fig.show()
plt.figure(figsize = (8,5) )
sns.histplot(data = data4, x = 'level', )
<AxesSubplot: xlabel='level', ylabel='Count'>
data = pd.read_csv('../data/kc_house_data.csv')
houses = data[['id','lat','long','price']].copy()
# Define level
houses['level'] = houses['price'].apply(lambda x: 0 if x < 321950 else
1 if (x >= 321950) & (x < 450000) else
2 if (x >= 450000) & (x < 645000) else
3
)
houses['level'] = houses['level'].astype( int )
fig = px.scatter_mapbox( houses,
lat = 'lat',
lon = 'long',
color = 'level',
size = 'price',
color_continuous_scale = px.colors.cyclical.IceFire,
size_max = 15,
zoom = 10 )
fig.update_layout( mapbox_style = 'open-street-map' )
fig.update_layout( height = 800, margin = {'r':0, 't':0, 'l':0, 'b':0})
#fig.show()
fig.write_html("../results/kc_house_map.html")
fig.show()